---
title: "Replication"
output: html_document
date: "2023-03-19"
---
Variable description:
cutter: a jieba R cutter contains a dictionary of Chinese stopwords and user-defined words.
covid_words: a dictionary of COVID-19 related words, in Appendix 2.
weibo: Original Weibo data.
weibo_paragraphs: Weibo data split into paragraphs.
weibo_data_covid: Weibo paragraphs with COVID-19 related words.
segword_paragraph: segmented COVID-19 related  weibo_paragraphs(list).
segword_combined_paragraph: segmented COVID-19 related weibo_paragraphs(character).
covid_corpus_paragraph: a quanteda corpus of COVID-19 related weibo_paragraphs.
token_covid_paragraph: a quanteda token of COVID-19 related weibo_paragraphs.
covid_dfm_trim_paragraph: a quanteda dfm of COVID-19 related weibo_paragraphs, removing empty rows to condcut LDA.
token_covid_paragraph_trim: a quanteda token of COVID-19 related weibo_paragraphs, removing empty rows to condcut LDA.
merge_date_use_old: metadata on daily basis collected from several different resources, including Baidu Index, OXCGRT and other data sources. Not all of them are used in following analysis. 
OxCGRT_Index_date: Strigency data in OxCGRT.
OxCGRT_all: all OxCGRT data.

```{r cars, echo=F,warning=FALSE,message=FALSE}
library(readr)
library(devtools)
library(quanteda.textplots)
library(tm)
library(tmcn)
library(tidytext)
library(jiebaRD)
library(jiebaR)
library(dplyr)
library(tidyr)
library(quanteda)
#library(corpustools)
library(ggplot2)
#require(pals)
library(stringr)
library(lubridate)
library(readtext)
library(topicmodels)
library(writexl)
library(RColorBrewer)
library(text2vec)
library(LDAvis)
library(TSstudio)
#rm(list=ls())
```

```{r}
rm(list = ls())
load("replication_load_data.rdata")
```

# data process
## generate socio-economic variables by region 
```{r}
OxCGRT_all$date <-sub("(.{3})(.)(.{1})(.)(.{1})(.*)", "\\1\\2-\\3\\4-\\5\\6", OxCGRT_all$Date)
OxCGRT_all$date=as.Date(OxCGRT_all$date)

OxCGRT<-OxCGRT_all[OxCGRT_all$date< "2022-11-11" & OxCGRT_all$date> "2021-10-01",]
OxCGRT$week_num=format(OxCGRT$date, "%Y-%V") 
```

```{r}
OxCGRT_east<-OxCGRT[OxCGRT$RegionName==c("Beijing", "Tianjin", "Hebei", "Shandong", "Jiangsu", "Shanghai", "Zhejiang", "Fujian", "Guangdong", "Hainan"),]
OxCGRT_northeast<-OxCGRT[OxCGRT$RegionName==c("Heilongjiang", "Jilin", "Liaoning"),]
OxCGRT_central<-OxCGRT[OxCGRT$RegionName==c("Shanxi", "Henan", "Hubei", "Hunan", "Jiangxi", "Anhui"),]
OxCGRT_west<-OxCGRT[OxCGRT$RegionName==c("Sichuan", "Guangxi", "Guizhou", "Yunnan", "Chongqing", "Shaanxi", "Gansu", "Inner Mongolia", "Ningxia", "Xinjiang", "Qinghai", "Tibet"),]

# gen diff_value as increased data each day
OxCGRT_east <-  OxCGRT_east[order(OxCGRT_east$date), ]
OxCGRT_east$new_cases <- c(NA, diff(OxCGRT_east$ConfirmedCases))
OxCGRT_east$new_deaths <- c(NA, diff(OxCGRT_east$ConfirmedDeaths))

OxCGRT_central <-  OxCGRT_central[order(OxCGRT_central$date), ]
OxCGRT_central$new_cases <- c(NA, diff(OxCGRT_central$ConfirmedCases))
OxCGRT_central$new_deaths <- c(NA, diff(OxCGRT_central$ConfirmedDeaths))

OxCGRT_west <-  OxCGRT_west[order(OxCGRT_west$date), ]
OxCGRT_west$new_cases <- c(NA, diff(OxCGRT_west$ConfirmedCases))
OxCGRT_west$new_deaths <- c(NA, diff(OxCGRT_west$ConfirmedDeaths))

OxCGRT_northeast <-  OxCGRT_northeast[order(OxCGRT_northeast$date), ]
OxCGRT_northeast$new_cases <- c(NA, diff(OxCGRT_northeast$ConfirmedCases))
OxCGRT_northeast$new_deaths <- c(NA, diff(OxCGRT_northeast$ConfirmedDeaths))
```

merge socio-economic data to week data 
```{r}
OxCGRT_east<-aggregate(OxCGRT_east[,c(64:65,55)], by = list(OxCGRT_east$week_num),FUN="mean") 
OxCGRT_east<-OxCGRT_east[OxCGRT_east$Group.1!="2022-52",]
colnames(OxCGRT_east)<-c('week_num',"New_cases_east","New_deaths_east","StringencyIndex_east")

OxCGRT_west<-aggregate(OxCGRT_west[,c(64:65,55)], by = list(OxCGRT_west$week_num),FUN="mean") 
OxCGRT_west<-OxCGRT_west[OxCGRT_west$Group.1!="2022-52",]
colnames(OxCGRT_west)<-c('week_num',"New_cases_west","New_deaths_west","StringencyIndex_west")

OxCGRT_central<-aggregate(OxCGRT_central[,c(64:65,55)], by = list(OxCGRT_central$week_num),FUN="mean") 
OxCGRT_central<-OxCGRT_central[OxCGRT_central$Group.1!="2022-52",]
colnames(OxCGRT_central)<-c('week_num',"New_cases_Central","New_deaths_Central","StringencyIndex_Central")

OxCGRT_northeast<-aggregate(OxCGRT_northeast[,c(64:65,55)], by = list(OxCGRT_northeast$week_num),FUN="mean") 
OxCGRT_northeast<-OxCGRT_northeast[OxCGRT_northeast$Group.1!="2022-52",]
colnames(OxCGRT_northeast)<-c('week_num',"New_cases_northeast","New_deaths_northeast","StringencyIndex_northeast")

OxCGRT_4region=full_join(OxCGRT_east,OxCGRT_central,by="week_num") %>% full_join(.,OxCGRT_northeast,by="week_num") %>% full_join(.,OxCGRT_west,by="week_num")
OxCGRT_4region$week_num_1<- as.factor(OxCGRT_4region$week_num)
```


# topic modeling (k=125)
```{r}
lda.1gram.125<-LDA(token_covid_paragraph_trim, k = 125, method = "Gibbs", control = list(verbose=25L, seed = 123, burnin = 125, iter = 125)) 

top5termsPerTopic.1gram.125 <-terms(lda.1gram.125, 10)
covid.topics.1gram.125 <- topicmodels::topics(lda.1gram.125, 1)
covid.terms.1gram.125 <- as.data.frame(topicmodels::terms(lda.1gram.125, 20), stringsAsFactors = FALSE)

tmResult <- posterior(lda.1gram.125)

theta <- tmResult$topics
dim(theta)               # nDocs(DTM) distributions over K topics
weibo_data_covid<-token_covid_paragraph_trim@docvars

weibo_data_covid$topic1_125=get_topics(lda.1gram.125)

weibo_data_covid_topic<-cbind(weibo_data_covid,theta)

#output keyterm of each topic
get_terms(lda.1gram.125,20) %>%  paste(., collapse = ",") %>%   writeLines(., "keyterm.txt") 
```
### remove topics which the max theta is too small
```{r}
theta_process=theta %>% as.data.frame()

# get theta for used topics
theta_process=theta_process[,c(21,107,79,43,57,35,
                               61,47,46,106,
                               52,105,11,30,
                               45,12)]

topicname.title.125<-
  c("Show Institutional Superiority \n of China’s Covid-19 Policy",
    "Show CPC’s Achievements in \n Combating Covid-19",
    "Show Chinese Contributions \n to Global Covid-19 Prevention",
    "Show China’s Actions in \n International Cooperation",
    "Show Failures in \n Western Response",
    "Show Western Politicians \n Getting Infected",
    
    "Disclose Central Government's \n Policy Arrangements",
    "Disclose Health Departments’ \n Policy Arrangements",
    "Disclose Punishment for Inadequate \n Covid-19 Prevention or Illegal Activities",
    "Disclose Personnel Changes",
   
   "Respond to Citizens’ \n Impacted Interest",
   "Respond to Citizens’ \n Mental Health Problems",
   "Respond to Government \n Expenditure",
   "Respond to Public \n Outrage in Escalating Measures",
    
   "Urge Resuming Work \n and Production",
   "Urge Macroeconomic \n Development"
   )

colnames(theta_process)<-topicname.title.125
```

```{r}
#set a threshold (decided by researchers)
thresholds_125 <- c(0.04,0.05,0.05,0.05,0.04,0.05,
                    0.03,0.04,0.04,0.04,
                    0.05,0.065,0.1,0.08,
                    0.05,0.05)

# thresholds_125 <- c(0.05,0.05,0.05,0.05,0.05,0.05,
#                     0.05,0.05,0.05,0.05,
#                     0.05,0.05,0.05,0.05,
#                     0.05,0.05)


theta_process_threshold<- theta_process
for (i in 1:ncol(theta_process_threshold)) {
  condition <- !is.na(theta_process_threshold[, i]) & (theta_process_threshold[, i] < thresholds_125[i])
  theta_process_threshold[condition, i] <- 0
}
head(theta_process_threshold)
```

```{r}
# get topic proportion per day
topic_proportion_per_day_125 <- aggregate(theta_process_threshold, by = list(weibo_data_covid$日期),FUN="mean")

# set topic names to aggregated columns
colnames(topic_proportion_per_day_125)[1]<- "date"
```


## merge socio-economic data
```{r}
#merge average strigency index at local level
merge_date_use_0126<-dplyr::left_join(merge_date_use_0126,OxCGRT_Index_date[,c(2:4)],by="date")

#merge topic data: after filter by threshold 
merge_date_use_0126<-dplyr::full_join(merge_date_use_0126,topic_proportion_per_day_125,by="date")

#ts data_all time
ts_merge_date_use_date=ts(merge_date_use_0126, 
   freq=365.25, 
   start=decimal_date(ymd("2021-10-02")))

merge_date_use_0126$week_all=format(merge_date_use_0126$date, "%Y-Week %V") #generate week data
merge_date_use_0126$week_num=format(merge_date_use_0126$date, "%Y-%V") #generate week data

merge_week_use_0126<- aggregate(merge_date_use_0126, by = list(merge_date_use_0126$week_num),FUN = function(x) mean(x, na.rm = TRUE))


############################ subset ts time during zero-covid 
merge_date_use_0126_zero<-merge_date_use_0126[merge_date_use_0126$date<"2022-11-11" & merge_date_use_0126$date>="2021-11-01",]
merge_date_use_0126_zero<-merge_date_use_0126_zero[is.na(merge_date_use_0126_zero$date) == FALSE,]
merge_date_use_0126_zero

colnames(merge_date_use_0126_zero)

## aggregate subtype of discourse
merge_date_use_0126_zero$ideology<- rowSums(merge_date_use_0126_zero[,c(24:29)])
merge_date_use_0126_zero$imperative<- rowSums(merge_date_use_0126_zero[,c(30:33)])
merge_date_use_0126_zero$communicative<-  rowSums(merge_date_use_0126_zero[,c(34:37)])
merge_date_use_0126_zero$directive<-  rowSums(merge_date_use_0126_zero[,c(38:39)])

ts_merge_date_use_zero=ts(merge_date_use_0126_zero, 
   freq=365.25, 
   start=decimal_date(ymd("2021-11-01")))

# gen week data
merge_week_0126_zero<- aggregate(merge_date_use_0126_zero, by = list(merge_date_use_0126_zero$week_num),FUN = function(x) mean(x, na.rm = TRUE))

# gen used data
lm_merge_week_0126_zero<- merge_week_0126_zero[1:54,]
lm_merge_week_0126_zero<- merge(lm_merge_week_0126_zero,OxCGRT_4region, by.x ="Group.1", by.y="week_num", all.x = TRUE )

date_week <- OxCGRT[,c("date","week_num")] %>%
  dplyr::group_by(week_num) %>%
  dplyr::summarise(min_date = min(date))

lm_merge_week_0126_zero <-  merge(lm_merge_week_0126_zero,date_week, by.x ="Group.1", by.y = "week_num"  , all.x = TRUE)
colnames(merge_week_0126_zero)
#rename variable to English
colnames(lm_merge_week_0126_zero)[1]<- "week_num"
colnames(lm_merge_week_0126_zero)[3]<- "National Stringency Index"
colnames(lm_merge_week_0126_zero)[4]<- "Vaccine Doses (per 100)"
colnames(lm_merge_week_0126_zero)[5]<- "New Cases (per 100000)"
colnames(lm_merge_week_0126_zero)[6]<- "New Deaths (per 100000)"
colnames(lm_merge_week_0126_zero)[7]<- "New Vaccine Doses (per 100)"
colnames(lm_merge_week_0126_zero)[8]<- "Expenditure on basic medical insurance(Month)"
colnames(lm_merge_week_0126_zero)[9]<- "Fiscal Expenditure (Month)"
colnames(lm_merge_week_0126_zero)[10]<- "Local government debt balance (Month)"
colnames(lm_merge_week_0126_zero)[11]<- "National Urban Survey Unemployment Rate (Month)"
colnames(lm_merge_week_0126_zero)[12]<- "Year-on-year growth in industrial value added (Month)"
colnames(lm_merge_week_0126_zero)[16]<- "National Stringency Index (Input)"
 
write.csv(lm_merge_week_0126_zero, "lm_merge_week_0302_manual_threshold.csv")
```
# Manuscirpt

## Table 2: Media list
```{r}
#output media data
table(weibo_data_covid$来源)%>% as.data.frame()%>%  write_xlsx(.,"D:\\坚果云\\我的坚果云\\Nutstore\\PHD\\2023春季\\社交媒体项目\\replication0302\\Table2.xlsx")
```

## Table 3: description analysis of socio-economic variables
```{r}
library(summarytools)
#by date
colnames(merge_date_use_0126_zero)
describe_table3<-lm_merge_week_0126_zero[,c(22,4,5,3,6,16)]
colnames(describe_table3)<-c("Stringency Index","New Cases (per 100000)","New Deaths (per 100000)","Vaccine Doses (per 100)","New Vaccine Doses (per 100)","Medical Pressure")
summarytools::view(dfSummary(describe_table3))

```

## Table 4
```{r}
topic_proportions_selected_topics <-  colSums(theta_process)/sum(theta)

topic_proportions_selected_topics<-as.data.frame(topic_proportions_selected_topics)
topic_proportions_selected_topics_threshold <-  colSums(theta_process_threshold)/sum(theta)
topic_proportions_selected_topics_threshold= as.data.frame(topic_proportions_selected_topics_threshold)

Table4<-cbind(colnames(theta_process) %>% as.data.frame(),topic_proportions_selected_topics,topic_proportions_selected_topics_threshold) 
write_xlsx(Table4,"Table 4.xlsx")
```


## Figure 3: Evolution of Political Discourse peer Week
```{r}
discourse<-merge_week_0126_zero[c(2,43:46)]
colnames(discourse) <-c("date","Ideology Discourse","Imperative Discourse","Directive Discourse","Communicative Discourse")

vizDataFrame_merge_week_use_0126_4discourse <- reshape2::melt(discourse, id.vars = "date")
vizDataFrame_merge_week_use_0126_4discourse$date<-as.Date(vizDataFrame_merge_week_use_0126_4discourse$date)

colnames(vizDataFrame_merge_week_use_0126_4discourse)[2]<- "Discourse"
jpeg(file="Figure 3.jpeg",width=3200, height=1500,res=300)

ggplot(vizDataFrame_merge_week_use_0126_4discourse, aes(x=date, y=value,color=Discourse, linetype =Discourse)) + 
  geom_line() + theme_minimal()+
  labs(y="the Proportions of Discourse", x="Date")+
  theme(text = element_text(size = 12))+
  theme(axis.text.x = element_text( hjust = 1,size=12),
        legend.text = element_text(size=12),
        legend.position = "right") +  scale_x_date(date_labels = "%Y-%m-%d")+  scale_color_grey(start = 0.8, end = 0 ) +
  guides(color = guide_legend(override.aes = list()))
dev.off()
```
## Figure 4: Evolution of Stringency Index by Region
```{r}
strigency_index_region<-lm_merge_week_0126_zero[c(49,52,55,58,60)]
colnames(strigency_index_region) <-c("Stringency Index in East","Stringency Index in Central","Stringency Index in Northeast","Stringency Index in West","min_date" )

vizDataFrame_merge_week_use_0126_substring <- reshape2::melt(strigency_index_region, id.vars = "min_date")
vizDataFrame_merge_week_use_0126_substring$min_date<-as.Date(vizDataFrame_merge_week_use_0126_substring$min_date)
colnames(vizDataFrame_merge_week_use_0126_substring)<-c("min_date","Stringency Index by Region","value")

jpeg(file="Figure 4.jpeg",width=3500, height=2000,res=300)
ggplot(vizDataFrame_merge_week_use_0126_substring, aes(x = min_date, y = value, color = `Stringency Index by Region`, linetype = `Stringency Index by Region`)) + 
  geom_line() + 
  theme_minimal() + 
  labs(y = "Value of Stringency Index", x = "date") +
  theme(text = element_text(size = 12)) +
  theme(axis.text.x = element_text(hjust = 1, size = 12),
        legend.text = element_text(size = 12),
        legend.position = "right") +
  scale_x_date(date_labels = "%Y-%m-%d") +
  scale_color_grey(start = 0.8, end = 0 ) +
  guides(color = guide_legend(override.aes = list()))
dev.off()
```
## lm: for convenience, lm is conducted by stata with lm_merge_week_0126_zero
```{r}
write_xlsx(lm_merge_week_0126_zero, "lm_merge_week_0126_zero.xlsx")
write.csv(lm_merge_week_0126_zero, "lm_merge_week_0126_zero.csv")
```

# Appendix
## Table A 1: description analysis of socio-economic variables
```{r}
library('summarytools')
#by week
describe_tableA1<-merge_date_use_0126_zero[,c(22,4,5,3,6,16)]
colnames(describe_tableA1)<-c("Stringency Index","New Cases (per 100000)","New Deaths (per 100000)","Vaccine Doses (per 100)","New Vaccine Doses (per 100)","Medical Pressure")
view(dfSummary(describe_tableA1))
```

## Appendix 2: A COVID-19 Dictionary
```{r}
#output dictionary
covid_words %>% paste(., collapse = "，") %>%  writeLines(., "Appendix 2.txt") 
```

## Figure A2: Evolutions of China's Socioeconomic Indicators Related to the COVID-19 During the Infectious Period of the COVID-19 Omicron Variant (October 2021 - November 2022)
```{r}
# change names of ts_merge_date_use_zero
#colnames(merge_date_use_0126_zero[,c(22,4,5,3,6,9)]) 
data_figureA2<-merge_date_use_0126_zero[,c(22,4,5,3,6,9)]
colnames(data_figureA2)<-c("Stringency Index", 
    "New Cases (per hundred thousand)",
    "New Deaths (per hundred thousand)",
    "Vaccine Doses (per hundred)",
    "New Vaccine Doses (per hundred)",
    "Local government debt balance")

ts_figureA2=ts(data_figureA2, 
   freq=365.25, 
   start=decimal_date(ymd("2021-11-01")))
```

```{r}
jpeg(file="Figure A. 2.jpeg",width=3000, height=2000,res=200)
par(cex.axis = 1)  # Adjust the value as needed for the desired text size
plot(ts_figureA2,main = " ", cex.main = 1.5, adj = 0.5)
grid(lwd =3)
# Adjust y-axis text size and angle
#,cex=0.8, cex.main=0.8, cex.lab=0.8, cex.axis=0.8
dev.off()
```
## Figure A3-A6: plot of 125 topic(time series)
```{r}
jpeg(file="Figure A. 3.jpeg",width=4000, height=3000,res=300)
plot(ts_merge_date_use_zero[,c(22,24:29)],main = " ",cex=0.8, cex.main=0.8, cex.lab=0.8, cex.axis=0.8)
grid(lwd =3)
dev.off()

jpeg(file="Figure A. 4.jpeg",width=4000, height=3000,res=300)
par(mar = c(15, 15, 14,12))
plot(ts_merge_date_use_zero[,c(22,30:33)],main = " ",cex=0.8, cex.main=0.8, cex.lab=0.8, cex.axis=0.8)
grid(lwd =3)
dev.off()

jpeg(file="Figure A. 5.jpeg",width=4000, height=3000,res=300)
plot(ts_merge_date_use_zero[,c(22,38:39)],main = " ",cex=0.8, cex.main=0.8, cex.lab=0.8, cex.axis=0.8)
grid(lwd =3)
dev.off()

jpeg(file="Figure A. 6.jpeg",width=4000, height=3000,res=300)
plot(ts_merge_date_use_zero[,c(22,34:37)],main = " ",cex=0.8, cex.main=0.8, cex.lab=0.8, cex.axis=0.8)
grid(lwd =3)
dev.off()
```

```{r}
save(list=ls(),file="replication_result.rdata")
```